This report shows the QC steps for gene expression microarry data from GEO study, including:
Change the variables here
# GEO id
geo_id="GSE4917"
# direcotry stores GEO data and phenotype file
datadir="C:/Users/mengykan/Projects/AsthmaApp/data/microarray"
resdir="C:/Users/mengykan/Projects/AsthmaApp/res/microarray"
Install the prerequisite R packages
source("http://bioconductor.org/biocLite.R")
biocLite("GEOquery")
biocLite("oligo")
biocLite("affy") # for Affymetrix microarray-specific QC analysis
biocLite("viridis")
install.packages("ggplot2")
install.packages("gplots")
install.packages("Hmisc")
install.packages("devtools")
install.package("dplyr")
install.package("pander")
Load the necessary libraries
library(GEOquery)
library(oligo)
library(viridis) # heatmap colour
library(ggplot2)
library(gplots) # heatmap.2 plot
library(Hmisc) # compute hoeffd (Hoeffding's D statistics) for MA plot
library(devtools) # compuate PCs
library(pander)
library(dplyr)
This step needs inspection as phenotype data provided by researchers are different. In general, the information of sample id, sample geo id, tissue, disease status, treatment status (if applied) is required. Addition information includes age and gender.
Load a GDS file with GEOquery. Download from GEO server if the data does not exist
If ths study was performed in multiple platforms, choose the samples using Affymetrix Human Genome U133 (Plus 2.0) Array
# check if GEO matrix file is existed
geo_fn <- list.files(path=datadir)[grepl(geo_id,list.files(path=datadir))&grepl("matrix.txt.gz$",list.files(path=datadir))]
if (length(geo_fn)==0) { # matrix files are alreadly downloaded
gselms <- getGEO(geo_id, destdir=datadir,GSEMatrix = TRUE) # dowanload matrix file
print(attr(gselms, "names"))
if (length(gselms)>1) { # multiple platform
message("This study was performed in multiple platforms")
idx <- grep("GLP96|GPL570", attr(gselms, "names")) # only use data from GPL96
gse <- gselms[[idx]]
} else {gse <- gselms[[1]]}
} else if (length(geo_fn)==1) {
print(geo_fn)
gse <- getGEO(filename=paste0(datadir,"/",geo_fn),GSEMatrix = TRUE)
} else { # multiple platform
print(geo_fn)
message("This study was performed in multiple platforms")
geo_fn <- geo_fn[grep("GLP96|GPL570",geo_fn)]
gse <- getGEO(filename=paste0(datadir,"/",geo_fn),GSEMatrix = TRUE)
}
## [1] "GSE4917_series_matrix.txt.gz"
Primary Check for phenotype information.
pheno.df <- pData(phenoData(gse))
names(pheno.df)
## [1] "title" "geo_accession"
## [3] "status" "submission_date"
## [5] "last_update_date" "type"
## [7] "channel_count" "source_name_ch1"
## [9] "organism_ch1" "characteristics_ch1"
## [11] "characteristics_ch1.1" "treatment_protocol_ch1"
## [13] "molecule_ch1" "extract_protocol_ch1"
## [15] "label_ch1" "label_protocol_ch1"
## [17] "taxid_ch1" "hyb_protocol"
## [19] "scan_protocol" "description"
## [21] "data_processing" "platform_id"
## [23] "contact_name" "contact_email"
## [25] "contact_phone" "contact_laboratory"
## [27] "contact_department" "contact_institute"
## [29] "contact_address" "contact_city"
## [31] "contact_state" "contact_zip/postal_code"
## [33] "contact_country" "supplementary_file"
## [35] "data_row_count"
dim(pheno.df)
## [1] 24 35
rm(gse) # remove this object as it will not be used
Show all the phenotype data from GEO study. If there are more than 10 levels of variables in a column, only show the first five levels of variables.
for (i in 1:ncol(pheno.df)) {
if (length(levels(pheno.df[,i]))>=10) {
res <- pheno.df[pheno.df[,i]%in%levels(pheno.df[,i])[1:5],]
res[,i] <- factor(res[,i])
} else {res <- pheno.df}
res <- data.frame(table(res[,i]))
names(res) <- c(names(pheno.df)[i],"counts")
pandoc.table(res,justify = 'left',split.table = Inf)
}
| title | counts |
|---|---|
| Control(ethanol)-treated sample at 24hr, biological rep1 | 1 |
| Control(ethanol)-treated sample at 24hr, biological rep2 | 1 |
| Control(ethanol)-treated sample at 24hr, biological rep3 | 1 |
| Control(ethanol)-treated sample at 2hr, biological rep1 | 1 |
| Control(ethanol)-treated sample at 2hr, biological rep2 | 1 |
| geo_accession | counts |
|---|---|
| GSM109204 | 1 |
| GSM109205 | 1 |
| GSM109206 | 1 |
| GSM109207 | 1 |
| GSM109208 | 1 |
| status | counts |
|---|---|
| Public on Jun 02 2006 | 24 |
| submission_date | counts |
|---|---|
| May 17 2006 | 24 |
| last_update_date | counts |
|---|---|
| Apr 24 2013 | 24 |
| type | counts |
|---|---|
| RNA | 24 |
| channel_count | counts |
|---|---|
| 1 | 24 |
| source_name_ch1 | counts |
|---|---|
| MCF10A-Myc cells treated with dexamethasone for 24hr | 3 |
| MCF10A-Myc cells treated with dexamethasone for 2hr | 3 |
| MCF10A-Myc cells treated with dexamethasone for 30 min | 2 |
| MCF10A-Myc cells treated with dexamethasone for 30min | 1 |
| MCF10A-Myc cells treated with dexamethasone for 4hr | 3 |
| organism_ch1 | counts |
|---|---|
| Homo sapiens | 24 |
| characteristics_ch1 | counts |
|---|---|
| Genotype: unknown | 24 |
| characteristics_ch1.1 | counts |
|---|---|
| Age: 35 | 24 |
| treatment_protocol_ch1 | counts |
|---|---|
| MCF10A-Myc cells were subjected to 72 hrs of growth factor withdrawal followed by treatment with dexamethasone (Dex) for 24hr. | 1 |
| MCF10A-Myc cells were subjected to 72 hrs of growth factor withdrawal followed by treatment with dexamethasone for 2 hr. | 2 |
| MCF10A-Myc cells were subjected to 72 hrs of growth factor withdrawal followed by treatment with dexamethasone for 24 hr. | 2 |
| MCF10A-Myc cells were subjected to 72 hrs of growth factor withdrawal followed by treatment with dexamethasone for 2hr. | 1 |
| MCF10A-Myc cells were subjected to 72 hrs of growth factor withdrawal followed by treatment with dexamethasone for 30 min. | 3 |
| molecule_ch1 | counts |
|---|---|
| total RNA | 24 |
| extract_protocol_ch1 | counts |
|---|---|
| Total RNA from each sample was extracted using Qiagenâ RNeasy Kit. | 24 s |
| label_ch1 | counts |
|---|---|
| biotin | 24 |
| label_protocol_ch1 | counts |
|---|---|
| Biotinylated cRNA were prepared according to the standard Affymetrix protocol from 6 microg total RNA (Expression Analysis Technical Manual, 2001, Affymetrix). | 24 |
| taxid_ch1 | counts |
|---|---|
| 9606 | 24 |
| hyb_protocol | counts |
|---|---|
| Following fragmentation, 10 microg of cRNA were hybridized for 16 hr at 45C on Affymetrix human genome HG-U133A genechip. GeneChips were washed and stained in the Affymetrix Fluidics Station 450. | 24 |
| scan_protocol | counts |
|---|---|
| GeneChips were scanned using the Affymetrix GeneChip® Scanner 3000 7G. | 19 |
| GeneChips were scanned using the Affymetrix GeneChip® Scanner 3000 7G.. | 5 |
| description | counts |
|---|---|
| Gene expression data from embryos in slow phase of cellularisation. | 1 |
| Gene expression data from MCF10A-Myc cells treated with dex for 2 hr. | 1 |
| Gene expression data from MCF10A-Myc cells treated with Dex for 2 hr. | 1 |
| Gene expression data from MCF10A-Myc cells treated with dex for 24 hr. | 1 |
| Gene expression data from MCF10A-Myc cells treated with Dex for 24 hr. | 1 |
| data_processing | counts |
|---|---|
| The data were analyzed and normalized Robust Multi-array Average (RMA) algorithm. | 24 |
| platform_id | counts |
|---|---|
| GPL96 | 24 |
| contact_name | counts |
|---|---|
| Suzanne,Daniela,Conzen | 24 |
| contact_email | counts |
|---|---|
| sconzen@medicine.bsd.uchicago.edu | 24 |
| contact_phone | counts |
|---|---|
| (773)834-2604 | 24 |
| contact_laboratory | counts |
|---|---|
| Conzen Lab | 24 |
| contact_department | counts |
|---|---|
| Medicine | 24 |
| contact_institute | counts |
|---|---|
| The University of Chicago | 24 |
| contact_address | counts |
|---|---|
| 5841 S. Maryland Ave, SBRI J301, MC 2115 | 24 |
| contact_city | counts |
|---|---|
| Chicago | 24 |
| contact_state | counts |
|---|---|
| IL | 24 |
| contact_zip/postal_code | counts |
|---|---|
| 60637 | 24 |
| contact_country | counts |
|---|---|
| USA | 24 |
| data_row_count | counts |
|---|---|
| 22215 | 24 |
Select required phenotype variables from the primary check and generate standardized columns. This is a manual step.
cols <- c("title","geo_accession","source_name_ch1")
pheno.new <- pheno.df %>%
dplyr::select(cols) %>%
dplyr::mutate(GEO_ID=geo_accession) %>%
dplyr::mutate(Subject=gsub("^.*biological (.*)$","\\1",title)) %>%
dplyr::mutate(Sample=paste(geo_accession,Subject,sep="_")) %>%
dplyr::mutate(Tissue="MCF10A-Myc") %>%
dplyr::mutate(treatment_time=gsub("^.*for (\\d+.*)$","\\1",source_name_ch1)) %>%
dplyr::mutate(treatment_time=gsub(" ","",treatment_time)) %>%
dplyr::mutate(treatment_drug=ifelse(grepl("dexamethasone",source_name_ch1),"dex","baseline")) %>%
dplyr::mutate(Treatment=paste(treatment_drug,treatment_time,sep="_")) %>%
dplyr::mutate(Age="35") %>%
dplyr::mutate_if(is.character,as.factor)
pheno.new <- pheno.new[,(length(cols)+1):ncol(pheno.new)]
write.table(pheno.new,paste0(datadir,"/",geo_id,"_Phenotype_withoutQC.txt"),col.names=T,row.names=F,sep="\t",quote=F)
rm(pheno.new)
Check phenotype table and the sample size for different groups
# read in the phenotype file if it was created before
pheno <-read.table(paste0(datadir,"/",geo_id,"_Phenotype_withoutQC.txt"),header=T, sep="\t")
pandoc.table(pheno[1:5,],split.table=Inf) # show first 5 rows
| GEO_ID | Subject | Sample | Tissue | treatment_time | treatment_drug | Treatment | Age |
|---|---|---|---|---|---|---|---|
| GSM109204 | rep1 | GSM109204_rep1 | MCF10A-Myc | 30min | baseline | baseline_30min | 35 |
| GSM109205 | rep1 | GSM109205_rep1 | MCF10A-Myc | 30min | dex | dex_30min | 35 |
| GSM109206 | rep1 | GSM109206_rep1 | MCF10A-Myc | 2hr | baseline | baseline_2hr | 35 |
| GSM109207 | rep1 | GSM109207_rep1 | MCF10A-Myc | 2hr | dex | dex_2hr | 35 |
| GSM109208 | rep1 | GSM109208_rep1 | MCF10A-Myc | 4hr | baseline | baseline_4hr | 35 |
avail_group=c("Tissue","Disease","Treatment")[c("Tissue","Disease","Treatment")%in%names(pheno)]
res=as.data.frame(table(pheno[,avail_group]))
names(res) <- c(avail_group,"Frequency")
pandoc.table(res,split.table=Inf)
| Tissue | Treatment | Frequency |
|---|---|---|
| MCF10A-Myc | baseline_24hr | 3 |
| MCF10A-Myc | baseline_2hr | 3 |
| MCF10A-Myc | baseline_30min | 3 |
| MCF10A-Myc | baseline_4hr | 3 |
| MCF10A-Myc | dex_24hr | 3 |
| MCF10A-Myc | dex_2hr | 3 |
| MCF10A-Myc | dex_30min | 3 |
| MCF10A-Myc | dex_4hr | 3 |
Quality control includes:
The general QC steps and outlier detection methods are derived from arrayQualityMetrics
Detailed description of QC steps for microarray data can be found here
Download raw data files (.cel) if the data folder does not exist.
celfiles <- paste0(datadir,"/",geo_id,"/data/",gsub("^.*/(.*)$","\\1",pheno.df$supplementary_file)) # get .cel file names to read in. This step is important for samples scanned in different platforms
if (!all(celfiles%in%list.files(path=paste0(datadir,"/",geo_id,"/data"),pattern=".CEL.gz|.cel.gz",full.names=T))) { # if cel files are not downloaded or not all the sample cel files exsist
getGEOSuppFiles(geo_id,baseDir=datadir) #download GEO files
untar(paste0(datadir,"/",geo_id,"/",geo_id,"_RAW.tar"), exdir=paste0(datadir,"/",geo_id,"/data")) # extract the zip file
}
Read in raw data. Generate raw data object under ExpressionFeatureSet (oligo class).
raw.data <- read.celfiles(celfiles)
Obtain scan date from raw data used for batch information
# As the scan date and scan time are usually joined by "T" or a white space, use both pattern to split the date with time
pheno$ScanDate_Group <- sapply(strsplit(as.character(protocolData(raw.data)$dates), "T| "), function(x) {x[[1]]})
pheno$ScanDate_Group <- as.factor(pheno$ScanDate_Group)
# assign colours to Scan Date for plots
colours=c("#1B9E77", "#D95F02", "#7570B3", "#E7298A", "#66A61E", "#E6AB02", "#A6761D", "#666666", "#8DD3C7", "#FFFFB3", "#BEBADA", "#FB8072", "#80B1D3", "#FDB462", "#B3DE69", "#FCCDE5", "#D9D9D9", "#BC80BD", "#CCEBC5", "#FFED6F") # first 8 colour names derived from Dark2, and last 12 names from Set3
i=length(levels(pheno$ScanDate_Group))
colour_list <- colours[1:i]
names(colour_list) <- levels(pheno$ScanDate_Group) # colour to corresponding scan date
# assign pheno type data to raw expression data
pData(raw.data) <- pheno
row.names(pData(raw.data)) <- sampleNames(protocolData(raw.data))
pandoc.table(pData(raw.data)[1:5,],split.table = Inf) # show first 5 rows
| GEO_ID | Subject | Sample | Tissue | treatment_time | treatment_drug | Treatment | Age | ScanDate_Group | |
|---|---|---|---|---|---|---|---|---|---|
| GSM109204.cel.gz | GSM109204 | rep1 | GSM109204_rep1 | MCF10A-Myc | 30min | baseline | baseline_30min | 35 | 05/23/02 |
| GSM109205.cel.gz | GSM109205 | rep1 | GSM109205_rep1 | MCF10A-Myc | 30min | dex | dex_30min | 35 | 05/23/02 |
| GSM109206.cel.gz | GSM109206 | rep1 | GSM109206_rep1 | MCF10A-Myc | 2hr | baseline | baseline_2hr | 35 | 05/23/02 |
| GSM109207.cel.gz | GSM109207 | rep1 | GSM109207_rep1 | MCF10A-Myc | 2hr | dex | dex_2hr | 35 | 05/23/02 |
| GSM109208.cel.gz | GSM109208 | rep1 | GSM109208_rep1 | MCF10A-Myc | 4hr | baseline | baseline_4hr | 35 | 05/23/02 |
# summary of ScanDate_Group
res=as.data.frame(table(pData(raw.data)$ScanDate_Group))
names(res) <- c("ScanDate_Group","Frequency")
pandoc.table(res,split.table=Inf) # show first 5 rows
| ScanDate_Group | Frequency |
|---|---|
| 04/13/05 | 8 |
| 05/23/02 | 7 |
| 05/31/02 | 8 |
| 06/03/02 | 1 |
Check if the sample names derived from expression data match those in phenotype file
The signal intensity distributions of all samples (arrays) are expected to have the similar scale (i.e. the similar positions and widths of the boxes). Use Kolmogorov-Smirnov statistic to detect the arrays that have values deviated from others, which may indicate an experimental problem.
raw.data.log2 <- log2(exprs(raw.data))
dim(raw.data.log2)
## [1] 506944 24
Use subsamp function to reduce intensity data with randomly selected 20000 probes
# subset data
subsamp <- function(x) {
subsample=20000 # if number of probes are >20000, randomly select 20000 probes for plot or compute
if (nrow(x)>subsample) {
ss = sample(nrow(x), subsample)
Mss = x[ss,,drop=FALSE]
} else {
ss = TRUE
Mss = x
}
Mss
}
Mss_raw <- subsamp(raw.data.log2)
Compute the Kolmogorov-Smirnov statistic Ka between each array’s (i.e. sample) values (i.e. log2 transformed raw probe intensity values) and the pooled, overall distribution of the values. Detect outliers that are deviated from the threshold using outlier_detection_KS function.
# compute KS statistics to detect outliers
outlier_detection_KS = function(exprs) { # matrix (row: probe intensities/RLE values etc., col: array (e.g. sample))
fx = ecdf(as.vector(exprs)) # get empirical cumulative distribution function of the data
KS=suppressWarnings(apply(exprs, 2, function(v)ks.test(v, y = fx, alternative="two.sided")$statistic))
stats = stats::fivenum(KS, na.rm = TRUE) # Tukey's five number summary (minimum, lower-hinge, median, upper-hinge, maximum)
iqr = diff(stats[c(2, 4)]) # lagged difference between the lower-hinge and upper-hinge
coef = 1.5
th = (stats[4] + coef * iqr)
list(threshold = th, stats=KS, outlier = which(KS > th))
}
outlier_raw = names(outlier_detection_KS(Mss_raw)$outlier)
Use boxplot_func function to plot raw probe intensities. Outliers marked with * have different raw proble intensity distributions from others
# boxplot function
boxplot_func <- function(Mss,outlier,ylab) {
# use * to mark the outliers in boxplot
array_name <- colnames(Mss)
array_name[array_name%in%outlier] <- paste0("*",outlier)
# boxplot RLE by array
ylim = quantile(Mss, probs = c(0.01, 0.99), na.rm=TRUE) # create range of y-axsis
# create data frame for plot
df <- data.frame(
sample_id=rep(colnames(Mss),each=nrow(Mss)),
values=as.numeric(Mss),
scandate=rep(pData(raw.data)$ScanDate_Group,each=nrow(Mss)) # for color
)
cols <- colour_list
ggplot(df, aes(sample_id,values,fill=scandate)) + geom_boxplot(outlier.colour=NA) +
coord_flip() + theme_bw() +
ylim(ylim) +
scale_x_discrete(labels=array_name) +
ylab(ylab) +
scale_fill_manual("Scan Date",values=cols) +
theme(axis.title.y=element_blank())
}
boxplot_func(Mss_raw,outlier_raw,"Raw Probe Intensities")
This step requires manual inspection. The intensity curves of all samples (arrays) are expected to have the similar shapes and ranges. Samples with deviated curves are likely to have problematic experiments. Various features of the distributions can be indicative of quality related phenomena. For instance, high levels of background will shift an array’s distribution to the right. Lack of signal diminishes its right right tail. A bulge at the upper end of the intensity range often indicates signal saturation.
# create data frame for plot
df <- data.frame(
sample_id=rep(colnames(Mss_raw),each=nrow(Mss_raw)),
values=as.numeric(Mss_raw),
scandate=rep(pData(raw.data)$ScanDate_Group,each=nrow(Mss_raw)) # for color
)
cols <- colour_list
ggplot(df,aes(x=values,colour=scandate)) + geom_line(aes(group=sample_id),stat="density") +
theme_bw() +
xlab("Raw Probe Intensities") +
ylab("Density") +
scale_color_manual("Scan Date",values=cols)
rm(Mss_raw,df)
This step is Affymetrix microarray-specific QC analysis. There are two paired probe types: perfect match (PM) and of mismatched (MM) probes. A PM probe matches a strand of cDNA, while the corresponding MM probe differs from the PM by a change in the central nucleotide.
A probe set is called present if the intensity value of PM is significantly larger than MM.
The Affymetrix approach is under attack because between 15% - 30% of the MM are greater than the PM. For some newer arrays, MM probes are not used. If the number of PMs is not equal to that of MMs, this might be a PM-only array.
If this study has both PM and MM, a plot will be generated that shows the density distributions of the log2-transformed intensities grouped by the matching type of the probes. We expect that MM probes have poorer hybridization than PM probes, and thus that the PM curve is to the right of the MM curve.
if (class(raw.data)=="GeneFeatureSet") {
message("GeneFeatureSet does not have MM probes. No plots will be generated.")
} else {
PM <- log2(pm(raw.data))
MM <- log2(mm(raw.data))
if(nrow(PM) != nrow(MM)) {
message("This might be a PM-only array. No plots will be generated.")
rm(PM,MM)
} else {
subsample=20000
if(nrow(PM)>subsample) { # randomly select 20000 probe sets
sel = sample(nrow(PM), subsample)
sPM = PM[sel, ]
sMM = MM[sel, ]
} else {
sPM = PM
sMM = MM
}
rm(PM,MM)
df <- data.frame(
values=c(as.numeric(sPM),as.numeric(sMM)),
types=c(rep("PM",each=length(as.numeric(sPM))),rep("MM",each=length(as.numeric(sMM))))
)
cols=colours[1:2]
ggplot(df,aes(x=values,colour=types)) + geom_line(aes(group=types),stat="density") +
theme_bw() +
xlab("Intensity") +
ylab("Density") +
scale_color_manual(values=cols) +
theme(legend.title=element_blank())
}
}
rm(sPM,sMM,df)
Overall RNA quality can be assessed by RNA degradation plots. In the gene expression array, each gene is represented by a probe set. Each probe set is 11-20 probes (pairs of oligos). This plot shows the average intensity of each probe across all probe sets, ordered from the 5´ to the 3´ end. It is expected that probe intensities are lower at the 5´ end of a probe set when compared to the 3´end as RNA degradation starts from the 5´end of a molecule. RNA which is too degraded will show a very high slope from 5´ to 3´. Thus, the standardized slope of the RNA degradation plot serves as quantitative indicator of the RNA degradation.
This step is an Affymetrix microarry-specific QC step, as the probes in each probe set matrix generated by affy package are ordered from 5’ to 3’, which is not implemented in oligo package.
Generate raw data object under AffyBatch (affy class) for affymetrix microarray-specific QC analysis.
library(affy) # for Affymetrix microarray-specific QC analysis
raw.data.affy <- read.affybatch(celfiles,compress=T)
Compute mean PM intensities in each probe position and plot in the next step. Adopt AffyRNAdeg function from affy package but include a step that randomly selects 20000 probe sets.
RNAdegaffy <- function(data){ # input a list of probe set matrix with rows as probe ids and columns as samples
{
names <- colnames(data[[1]])
probe.set.size <- function(x) {
size <- dim(x)[1]
return(size)
}
max.num <- sapply(data, probe.set.size) # get the number of probes in each probe set
tab <- (table(max.num)) # summarize the frequencies of probe numbers in probe sets
ord <- order(-as.numeric(tab)) # order the frequency from large to small
K <- as.numeric(names(tab))[ord[1]] # K is the number of probes appearing in most probe sets
data <- data[max.num == K] # select data of probe sets only have K number of probes
}
subsample=20000
if (length(data)>subsample) { # randomly select 10000 probe sets
ss = sample(length(data),subsample)
data = data[ss,drop=FALSE]
}
N <- length(data) # number of probe sets
n <- dim(data[[1]])[2] # number of samples
# create two matrices: number of samples * number of probes representing a probe set
mns <- matrix(nrow = n, ncol = K) # create matrix for mean values
sds <- mns # create matrix for sds values
get.row <- function(x, i = 1) {return(x[i, ])} # function to get each row (i.e. probe id, i) from one probe set x (i.e. probe list[[x]])
rowstack <- function(x, i = 1) {return(t(sapply(x, get.row, i)))} # function to combine the rows obtained using get.row (pms across samples by probe sets) to get a table (row: samples column: probe sets) and transpose the table (row: probe sets, column: samples)
for (i in 1:K) { # get probe id (position) from 1 to K from each probe set
data.stack <- rowstack(data, i) # get the probe pm values in a specific probe position across all samples from each probe set (rows are samples and columns are probe sets)
if(dim(data[[1]])[2]==1) data.stack <- t(data.stack)
mns[, i] <- colMeans(data.stack) # get the mean values at one probe position across all probe sets
sds[, i] <- apply(data.stack, 2, sd) # get the sd values at one probe position across all probe sets
}
mns.orig <- mns # store the original mns data matrix
mn <- mns[, 1] # select values in the first probe position
mns <- sweep(mns, 1, mn) # adjust for the intensity at the first probe position
mns <- mns/(sds/sqrt(N)) # adjust for standard error
lm.stats <- function(x) {
index <- 0:(length(x) - 1)
ans <- summary(lm(x ~ index))$coefficients[2, c(1, 4)] # use linear model fit the relationship between intensity and probe position
return(ans)
}
stats <- apply(mns, 1, lm.stats)
answer <- list(N, names, mns.orig, sds/sqrt(N), stats[1,], stats[2, ])
names(answer) <- c("N", "sample.names", "means.by.number","ses", "slope", "pvalue")
return(answer)
}
Obtain a list of probe sets with a matrix of oligos (probes) by samples as an input and compute statistics of the mean PM intensities from 5’ to 3’ probe positions.
PM_list <- affy::pm(raw.data.affy,LIST=T)
PM_list <- lapply(PM_list,log2)
raw.data.rnadeg <- RNAdegaffy(PM_list)
rm(PM_list)
status.cols <- unlist(lapply(pData(raw.data)$ScanDate_Group,function(x)colour_list[x])) # colour list to corresponding scan date list
plotAffyRNAdeg(raw.data.rnadeg,cols=status.cols)
legend("topleft",legend=names(colour_list),fill=colour_list,cex=0.6)
detach("package:affy", unload=TRUE) # detach affy package
MA plots allow pairwise comparison of the log-intensity of each array to a reference array and identification of intensity-dependent biases.
The y-axis of the MA-plot shows the log-ratio intensity of one array to the reference median array, which is called M (minus). M = log2(I1)-log2(I2) (I1: the intensity of the array studied; I2: the median intensity across arrays)
The x-axis indicates the average log-intensity of both arrays, which is called A (add). A = 1/2*(log2(I1)+log2(I2))
It is expected that the probe levels do not differ systematically within a group of replicates, so that the MA-plot is centered on the y-axis (y=0 or M=0) from low to high intensities.
Use the Hoeffding’s statistic Da to detect the outliers.
MA_cal <- function(x) { # matrix (row: probe intensities, col: array (samples)
medArray = rowMedians(x, na.rm=TRUE)
M = x - medArray
A = (x + medArray)/2
subsample=20000
if(nrow(M)>subsample) {
sel = sample(nrow(M), subsample)
sM = M[sel, ]
sA = A[sel, ]
} else {
sM = M
sA = A
}
list(M=sM,A=sA) # return a list with M and A data matrices
}
sMA <- MA_cal(raw.data.log2)
Compute the Hoeffding’s statistic Da (hoeffd function from Hmisc package) on the joint distribution of A and M for each array. Detect outliers that have a Da value greater than 0.15, the default threshold.
outlier_detection_MA = function(exprs) { # list with M and A data matrices
M=exprs$M
A=exprs$A
Dstats = sapply(1:ncol(M), function(x){hoeffd(A[,x], M[,x])$D[1,2]})
names(Dstats) <- colnames(M)
Dthresh = 0.15
list(threshold=Dthresh, stats=Dstats, outlier = which(Dstats > Dthresh))
}
outlier_res_MA <- outlier_detection_MA(sMA)
outlier_MA <- names(outlier_res_MA$outlier)
The MA plots show samples with the first 4 highest and lowest values of Da. The value of Da for each sample is shown in the panel headings. Outliers marked with * have Da values >0.15.
# select arrays with top 4 highest and lowest Da
stats_order <- order(outlier_res_MA$stats)
column_sel <- stats_order[c(1:4,(ncol(sMA$M)-3):ncol(sMA$M))]
stats_sel <- round(outlier_res_MA$stats[column_sel],2)
scandate_sel <- pData(raw.data)$ScanDate_Group[column_sel]
M_sel <- sMA$M[,column_sel]
A_sel <- sMA$A[,column_sel]
# use * to mark the outliers
array_name <- colnames(M_sel)
array_name[array_name%in%outlier_MA] <- paste0("*",array_name[array_name%in%outlier_MA])
array_name <- paste0(array_name," (D=",stats_sel,")") # add D statistics to corresponding samples
# create data frame for plot
df <- data.frame(
sample_id=rep(array_name,each=nrow(M_sel)),
scandate=rep(scandate_sel,each=nrow(M_sel)),
M=as.numeric(M_sel),
A=as.numeric(A_sel)
)
# MA plots
ggplot(df,aes(x=A,y=M,color=scandate)) + geom_point(alpha=0.1) + theme_bw() +
scale_color_manual("Scan Date",values=colour_list) +
facet_wrap(~sample_id,ncol=2)
rm(sMA,df)
Spatial images are artificial visualizations of an array that are created to detect spatial trends or biases on an array. Upon scanning, a scale factor is applied to each array of a dataset in order to equalize the arrays mean intensities. A dataset of good quality should have comparable scale factors with limited variation.
Normally, when the features are distributed randomly on the arrays, one expects to see a uniform distribution; control features with particularly high or low intensities may stand out. The rank scale is applied for plotting as it has the potential to amplify patterns that are small in amplitudebut systematic within an array.
This step is an Affymetrix microarry-specific QC step, as the AffyBatch object from affy package contains information of spatial x- and y-coordinate that is not implemented in oligo package.
library(affy)
affy_spatial <- function(affybatch) {
maxc = ncol(affybatch) # number of probes in x-coordinate
maxr = nrow(affybatch) # number of probes in y-coordinate
sx = rep(seq_len(maxc), each = maxr) ## spatial x-coordinate
sy = rep(seq_len(maxr), maxc) ## spatial y-coordinate
M = log2(affy::exprs(affybatch))
numArrays = dim(M)[2]
list(M=M,numArrays=numArrays,sx=sx,sy=sy)
}
raw.data.spatial <- affy_spatial(raw.data.affy)
detach("package:affy", unload=TRUE) # detach affy package
Compute the sum of the absolutes value of low frequency Fourier coefficients (Fa) across all the probes as a measure of large scale spatial structures. Detect outliers that have a Fa greater than the threshold.
outlier_detection_spatial <- function(affy_spatial_list) {
sx=affy_spatial_list$sx # spatial x-coordinate
sy=affy_spatial_list$sy # spatial y-coordinate
M=affy_spatial_list$M
numArrays=affy_spatial_list$numArrays
maxx = max(sx, na.rm=TRUE)
maxy = max(sy, na.rm=TRUE)
stat_spatial = numeric(numArrays)
for(a in seq_len(numArrays)) {
mat = matrix(NA_real_, nrow=maxy, ncol=maxx)
mat[cbind(sy, sx) ] = M[, a]
pg = fft(mat) ## periodogram, computes the discrete fourier transform
npg = Re(pg*Conj(pg))
npg[1,1] = 0 ## drop the constant component
stat_spatial[a] = sqrt(sum(npg[1:4, 1:4]) / sum(npg)) # low frequency power
}
names(stat_spatial)=colnames(M)
stats = stats::fivenum(stat_spatial, na.rm = TRUE) # Tukey's five number summary (minimum, lower-hinge, median, upper-hinge, maximum)
iqr = diff(stats[c(2, 4)]) # lagged difference between the lower-hinge and upper-hinge
coef = 1.5
th = (stats[4] + coef * iqr)
list(threshold = th, stats=stat_spatial, outlier = which(stat_spatial > th))
}
outlier_res_spatial=outlier_detection_spatial(raw.data.spatial)
outlier_spatial=names(outlier_res_spatial$outlier)
The spatial distribution plots show samples with the first 4 highest and lowest values of Fa. The value of Fa for each sample is shown in the panel headings. Outliers marked with * have Fa values of large scale spatial structures.
# select arrays with top 4 highest and lowest Da
stats_order <- order(outlier_res_spatial$stats)
column_sel <- stats_order[c(1:4,(ncol(raw.data.spatial$M)-3):ncol(raw.data.spatial$M))]
stats_sel <- round(outlier_res_spatial$stats[column_sel],2)
M_sel <- raw.data.spatial$M[,column_sel]
# apply rank to expression data
M_sel = apply(M_sel, 2, rank)
# use * to mark the outliers
array_name <- colnames(M_sel)
array_name[array_name%in%outlier_spatial] <- paste0("*",array_name[array_name%in%outlier_spatial])
array_name <- paste0(array_name," (F=",stats_sel,")") # add F statistics to corresponding samples
# create variables for plot
df <- data.frame(
sample_id=rep(array_name,each=nrow(M_sel)),
M=as.numeric(M_sel),
row=rep(raw.data.spatial$sy,ncol(M_sel)),
column=rep(raw.data.spatial$sx,ncol(M_sel))
)
# spatial distribution plots
ggplot(df,aes(x=row,y=column,fill=M)) + geom_tile() +
theme_bw() +
xlab("Raw Probe Intensiry in X") + ylab("Raw Probe Intensiry in Y") +
scale_fill_gradientn(name="Ranked Intensity",colours=viridis(256,option="B")) +
facet_wrap(~sample_id,ncol=2)
rm(raw.data.spatial,df)
The RLE values are computed by calculating for each probe-set the ratio between the expression of a probe-set and the median expression of this probe-set across all arrays of the experiment.
It is assumed that most probe-sets are not changed across the arrays, so it is expected that these ratios are around 0 on a log scale.
The boxplots presenting the distribution of these log-ratios should then be centered near 0 and have similar spread. Other behavior would be a sign of low quality.
First, fit probe level models for quality assessment using fitProbeLevelModel from Oligo package. Then, use RLE and NUSE functions to compute the RLE and NUSE values respectively.
raw.data2 <- raw.data # As fitPLM needs ExpressionFeatureSet object as an input, assign log transformed expression values to a new object "raw.data2"
exprs(raw.data2) <- raw.data.log2
fitPLM <- fitProbeLevelModel(raw.data)
fitPLM
rm(raw.data2)
M_RLE <- RLE(fitPLM, type="values") # generate RLE matrix
Mss_RLE <- subsamp(M_RLE) # use subsamp function to reduce RLE data with randomly selected 20000 probes
Compute the Kolmogorov-Smirnov statistic Ra between each array’s (i.e. sample) values (i.e. relative log expression values) and the pooled, overall distribution of the values. Detect outliers that are deviated from the threshold.
outlier_RLE = names(outlier_detection_KS(Mss_RLE)$outlier) # compute KS statistics to detect outliers
Use boxplot_func function to plot RLE. Outliers marked with * have values centered away from 0 and/or are more spread out are potentially problematic.
boxplot_func(Mss_RLE,outlier_RLE,"RLE")
rm(Mss_RLE)
The NUSE plot represents normalized standard error estimates from the Probe-Level Model (PLM) fit which computes expression measures on a probe set by probe set basis.
The standard error estimates are normalized so that the median standard error for each probe set across all arrays is equal to 1. A box plot of NUSE values is drawn for each array in a dataset and allows checking whether all distributions are centered near 1 and whether an array shows a globally higher spread of the NUSE distribution than the other arrays. Typically, a box centered above 1.1 is indicative of an array with quality problems
M_NUSE <- NUSE(fitPLM, type="values") # generate NUSE matrix
Mss_NUSE <- subsamp(M_NUSE) # use subsamp function to reduce RLE data with randomly selected 20000 probes
Compute 75% quantile Na of each array’s NUSE values Detect outliers that have larger Na deviated from the threshold.
# compute upper 75% quantile as statistics to detect outliers
outlier_detection_upperquartile = function(exprs) { # matrix (row: NUSE values, col: array (e.g. sample))
upperquartile = apply(exprs, 2, quantile, na.rm=TRUE, probs=0.75)
stats = stats::fivenum(upperquartile, na.rm = TRUE) # Tukey's five number summary (minimum, lower-hinge, median, upper-hinge, maximum)
iqr = diff(stats[c(2, 4)]) # lagged difference between the lower-hinge and upper-hinge
coef = 1.5
th = (stats[4] + coef * iqr)
list(threshold = th, outlier = which(upperquartile > th))
}
outlier_NUSE = names(outlier_detection_upperquartile(Mss_NUSE)$outlier) # compute upper 75% quantile statistics to detect outliers
Use boxplot_func function to plot RLE. Outliers marked with * have values centered away from 0 and/or are more spread out are potentially problematic.
boxplot_func(Mss_NUSE,outlier_NUSE,"NUSE")
rm(Mss_NUSE)
Correlation between arrays was evaluated by the performance of hierarchical clustering of arrays. Hierarchical clustering analysis is performed in two steps: first, the distances between all pairs of arrays are computed and second, a tree is created from these distances by repeatedly grouping those (groups of) arrays together that are closest to each other.
The distance d(ab) between two arrays a and b is computed as the mean absolute difference (L1-distance) between the data of the arrays (using the data from all probes without filtering). In formula, d(ab) = mean | M(ai) - M(bi) |, where M(ai) is the value of the i-th probe on the a-th array.
Outlier detection was computed using the sum of the distances to all other arrays, of which samples who have exceptionally large S(a) = Sum(b)d(ab) are outliers.
Distance estimation using dist2 function derived from genefilter R package
# dist2 estimation
dist2 = function (x,
fun = function(a, b) mean(abs(a - b), na.rm = TRUE),
diagonal = 0) {
if (!(is.numeric(diagonal) && (length(diagonal) == 1)))
stop("'diagonal' must be a numeric scalar.")
if (missing(fun)) {
res = apply(x, 2, function(w) colMeans(abs(x-w), na.rm=TRUE))
} else {
res = matrix(diagonal, ncol = ncol(x), nrow = ncol(x))
if (ncol(x) >= 2) {
for (j in 2:ncol(x))
for (i in 1:(j - 1))
res[i, j] = res[j, i] = fun(x[, i], x[, j])
} # if
} # else
colnames(res) = rownames(res) = colnames(x)
return(res)
}
m <- dist2(raw.data.log2)
dend = as.dendrogram(hclust(as.dist(m), method = "single"))
ord = order.dendrogram(dend)
outlier_detection_dist = function(exprs) { # matrix (row: distance to each sample, col: array (e.g. sample))
sum = colSums(exprs, na.rm=TRUE) # sum the total distance
stats = stats::fivenum(sum, na.rm = TRUE) # Tukey's five number summary (minimum, lower-hinge, median, upper-hinge, maximum)
iqr = diff(stats[c(2, 4)]) # lagged difference between the lower-hinge and upper-hinge
coef = 1.5
th = (stats[4] + coef * iqr)
list(threshold = th, outlier = which(sum > th))
}
outlier_dist = names(outlier_detection_dist(m)$outlier)
array_name <- colnames(m)
array_name[array_name%in%outlier_dist] <- paste0("*",outlier_dist)
array_name <- gsub(".CEL.gz|.cel.gz","",array_name) # shorten the sample id
status.cols <- unlist(lapply(pData(raw.data)$ScanDate_Group,function(x)colour_list[x])) # colour list to corresponding scan date list
heatmap.2(m,,Rowv=dend,Colv=dend,
col=viridis(256, option="B"),ColSideColors=status.cols,RowSideColors=status.cols,
labCol=array_name,labRow=array_name,
trace="none",
margins=c(12,20), # (bottom margin, left margin)
cexRow = 1,cexCol = 1,
keysize=1.5,key.title=NA,key.xlab="Dist2",key.ylab="Counts")
legend("bottomleft",legend=names(colour_list),fill=colour_list,cex=0.6)
rm(m)
PCA plots the information contained in the dataset in a reduced number of dimensions. Clustering and PCA plots enable the assessment to what extent arrays resemble each other, and whether this corresponds to the known resemblances of the samples.
# obtain original expression data
raw.data.pca <- na.omit(raw.data.log2)
# As scale function divides by the variance, the probe with the expression sd=0 across samples must be removed.
sd <- apply(raw.data.pca,1,sd)
raw.data.pca <- raw.data.pca[!sd==0,]
# compute pcs
pca <- prcomp(t(raw.data.pca), retx = TRUE, center = TRUE, scale = TRUE)
pc <- data.frame(pca$x)
# compute variance explained by each PC
vars <- pca$sdev^2
pcs <- t(pc)
pvars <- vars*100.0/sum(vars) # proportion of variance (%) explained by each PC
cumsum_pvars <- cumsum(pvars) # Cumulative Proportion of Variance (%)
if (nrow(pcs)>10) {nres <- 10} else {nres=nrow(pcs)} # select top 10 PCs if number of PCs >10
res <- data.frame(rownames(pcs),pvars,cumsum_pvars)[1:nres,]
names(res) <- c("PC","Proportion of Variance (%)","Cumulative Proportion of Variance (%)")
pandoc.table(res,split.table = Inf)
| PC | Proportion of Variance (%) | Cumulative Proportion of Variance (%) |
|---|---|---|
| PC1 | 85.75 | 85.75 |
| PC2 | 5.7 | 91.45 |
| PC3 | 2.186 | 93.64 |
| PC4 | 1.211 | 94.85 |
| PC5 | 0.979 | 95.83 |
| PC6 | 0.64 | 96.47 |
| PC7 | 0.4505 | 96.92 |
| PC8 | 0.4126 | 97.33 |
| PC9 | 0.3629 | 97.7 |
| PC10 | 0.3118 | 98.01 |
rm(raw.data.pca)
PCA plots by scan date
cols <- colour_list
df <- data.frame(
PC1=pc$PC1,
PC2=pc$PC2,
scandate=pData(raw.data)$ScanDate_Group
)
ggplot(df,aes(PC1,PC2,color=scandate)) + geom_point() +
theme_bw() +
scale_color_manual("Scan Date",values=cols)
PCA plots by treatment if more than 2 subjects involving in the same treatment condition
if(("Treatment"%in%colnames(pData(raw.data)))&(min(summary(pData(raw.data)$Treatment))>2)) {
# Assign colour to treatment status
i=nlevels(pheno$Treatment)
colour_treat <- colours[1:i]
names(colour_treat) <- levels(pData(raw.data)$Treatment)
cols <- colour_treat
df <- data.frame(
PC1=pc$PC1,
PC2=pc$PC2,
group=pData(raw.data)$Treatment
)
ggplot(df,aes(PC1,PC2,color=group)) + geom_point() +
theme_bw() +
scale_color_manual("Treatment",values=cols)
}
PCA plots by donor if the same donor involving in more than 2 tissue/treatment conditions
pData(raw.data)$Subject<-as.factor(pData(raw.data)$Subject)
if(("Treatment"%in%colnames(pData(raw.data)))&(min(summary(pData(raw.data)$Subject))>2)) {
# Assign colour to subject status
i=nlevels(pData(raw.data)$Subject)
colour_subj <- colours[1:i]
names(colour_subj) <- levels(pheno$Subject)
cols <- colour_subj
df <- data.frame(
PC1=pc$PC1,
PC2=pc$PC2,
group=pData(raw.data)$Subject
)
ggplot(df,aes(PC1,PC2,color=group)) + geom_point() +
theme_bw() +
scale_color_manual("Donor",values=cols)
}
rm(pca)
Summary outliers and detection methods in a table
outlier_all <- c(outlier_raw,outlier_MA,outlier_spatial,outlier_RLE,outlier_NUSE,outlier_dist) # all outliers
if (length(outlier_all)==0) {
message("No outlier was detected")
outlier_freq <- data.frame() # create an empty data frame
} else {
outlier_method <- c(rep("Raw Probe Intensity",length(outlier_raw)),
rep("MA",length(outlier_MA)),
rep("Spatial Distribution",length(outlier_spatial)),
rep("RLE",length(outlier_RLE)),
rep("NUSE",length(outlier_NUSE)),
rep("Dist2",length(outlier_dist))) # detection methods for the outliers
outlier_list <- list() # create empty list to store outliers and their corresponding methods
for (i in 1:length(outlier_all)){outlier_list[[outlier_all[i]]] <- append(outlier_list[[outlier_all[i]]],outlier_method[i])} # assign methods to each outlier
# summary table
Frequency <- sapply(outlier_list,length) # times to detect
Method <- unlist(lapply(names(outlier_list),function(x){paste0(outlier_list[[x]],collapse=", ")}))
outlier_freq <- data.frame(Frequency,Method)
pandoc.table(outlier_freq[order(outlier_freq$Frequency),],caption="Outlier Summary",split.table = Inf)
}
| Frequency | Method | |
|---|---|---|
| GSM109212.cel.gz | 1 | MA |
| GSM109217.cel.gz | 1 | MA |
| GSM109210.cel.gz | 1 | NUSE |
| GSM109215.cel.gz | 3 | Spatial Distribution, RLE, NUSE |
Create a new column “QC_Pass” in phenotype data. Samples detected as outliers more than twice are assigned to 0 otherwise to 1.
if (nrow(outlier_freq)==0) { # if no outlier was detected
pData(raw.data)$QC_Pass <- rep(1,nrow(pData(raw.data))) # assign 1 to all the samples
} else {
outliers <- row.names(outlier_freq)[which(outlier_freq$Frequency>2)] # remove samples that were detected as an outlier more than twice
pData(raw.data)$QC_Pass <- rep(NA,nrow(pData(raw.data))) # add column QC_Pass in phnotype data
pData(raw.data)$QC_Pass[rownames(pData(raw.data))%in%outliers] <- 0 # assign 0 to outliers to be removed
pData(raw.data)$QC_Pass[!rownames(pData(raw.data))%in%outliers] <- 1 # assign 1 to samples to be kept
}
pandoc.table(pData(raw.data),split.table = Inf)
| GEO_ID | Subject | Sample | Tissue | treatment_time | treatment_drug | Treatment | Age | ScanDate_Group | QC_Pass | |
|---|---|---|---|---|---|---|---|---|---|---|
| GSM109204.cel.gz | GSM109204 | rep1 | GSM109204_rep1 | MCF10A-Myc | 30min | baseline | baseline_30min | 35 | 05/23/02 | 1 |
| GSM109205.cel.gz | GSM109205 | rep1 | GSM109205_rep1 | MCF10A-Myc | 30min | dex | dex_30min | 35 | 05/23/02 | 1 |
| GSM109206.cel.gz | GSM109206 | rep1 | GSM109206_rep1 | MCF10A-Myc | 2hr | baseline | baseline_2hr | 35 | 05/23/02 | 1 |
| GSM109207.cel.gz | GSM109207 | rep1 | GSM109207_rep1 | MCF10A-Myc | 2hr | dex | dex_2hr | 35 | 05/23/02 | 1 |
| GSM109208.cel.gz | GSM109208 | rep1 | GSM109208_rep1 | MCF10A-Myc | 4hr | baseline | baseline_4hr | 35 | 05/23/02 | 1 |
| GSM109209.cel.gz | GSM109209 | rep1 | GSM109209_rep1 | MCF10A-Myc | 4hr | dex | dex_4hr | 35 | 05/23/02 | 1 |
| GSM109210.cel.gz | GSM109210 | rep1 | GSM109210_rep1 | MCF10A-Myc | 24hr | baseline | baseline_24hr | 35 | 05/31/02 | 1 |
| GSM109211.cel.gz | GSM109211 | rep1 | GSM109211_rep1 | MCF10A-Myc | 24hr | dex | dex_24hr | 35 | 05/23/02 | 1 |
| GSM109212.cel.gz | GSM109212 | rep2 | GSM109212_rep2 | MCF10A-Myc | 30min | baseline | baseline_30min | 35 | 05/31/02 | 1 |
| GSM109213.cel.gz | GSM109213 | rep2 | GSM109213_rep2 | MCF10A-Myc | 30min | dex | dex_30min | 35 | 05/31/02 | 1 |
| GSM109214.cel.gz | GSM109214 | rep2 | GSM109214_rep2 | MCF10A-Myc | 2hr | baseline | baseline_2hr | 35 | 05/31/02 | 1 |
| GSM109215.cel.gz | GSM109215 | rep2 | GSM109215_rep2 | MCF10A-Myc | 2hr | dex | dex_2hr | 35 | 06/03/02 | 0 |
| GSM109216.cel.gz | GSM109216 | rep2 | GSM109216_rep2 | MCF10A-Myc | 4hr | baseline | baseline_4hr | 35 | 05/31/02 | 1 |
| GSM109217.cel.gz | GSM109217 | rep2 | GSM109217_rep2 | MCF10A-Myc | 4hr | dex | dex_4hr | 35 | 05/31/02 | 1 |
| GSM109218.cel.gz | GSM109218 | rep2 | GSM109218_rep2 | MCF10A-Myc | 24hr | baseline | baseline_24hr | 35 | 05/31/02 | 1 |
| GSM109219.cel.gz | GSM109219 | rep2 | GSM109219_rep2 | MCF10A-Myc | 24hr | dex | dex_24hr | 35 | 05/31/02 | 1 |
| GSM109220.CEL.gz | GSM109220 | rep3 | GSM109220_rep3 | MCF10A-Myc | 30min | baseline | baseline_30min | 35 | 04/13/05 | 1 |
| GSM109221.CEL.gz | GSM109221 | rep3 | GSM109221_rep3 | MCF10A-Myc | 30min | dex | dex_30min | 35 | 04/13/05 | 1 |
| GSM109222.CEL.gz | GSM109222 | rep3 | GSM109222_rep3 | MCF10A-Myc | 2hr | baseline | baseline_2hr | 35 | 04/13/05 | 1 |
| GSM109223.CEL.gz | GSM109223 | rep3 | GSM109223_rep3 | MCF10A-Myc | 2hr | dex | dex_2hr | 35 | 04/13/05 | 1 |
| GSM109224.CEL.gz | GSM109224 | rep3 | GSM109224_rep3 | MCF10A-Myc | 4hr | baseline | baseline_4hr | 35 | 04/13/05 | 1 |
| GSM109225.CEL.gz | GSM109225 | rep3 | GSM109225_rep3 | MCF10A-Myc | 4hr | dex | dex_4hr | 35 | 04/13/05 | 1 |
| GSM109226.CEL.gz | GSM109226 | rep3 | GSM109226_rep3 | MCF10A-Myc | 24hr | baseline | baseline_24hr | 35 | 04/13/05 | 1 |
| GSM109227.CEL.gz | GSM109227 | rep3 | GSM109227_rep3 | MCF10A-Myc | 24hr | dex | dex_24hr | 35 | 04/13/05 | 1 |
pData(raw.data)$Filename <- rownames(pData(raw.data)) # add filename in a new column
Save in a new phenotype file
write.table(pData(raw.data),paste0(datadir,"/",geo_id,"_Phenotype_withQC.txt"),col.names=T,row.names=F,sep="\t",quote=F)
Create variables for subsetting phenotype data into separate comparison groups (e.g. asthma vs. healthy, rhinitis vs. healthy). Assign NULL values if the corresponding disease/treatment status is not available. This is a manual step.
disease_con1 <- NULL
disease_con0 <- NULL
treatment_con1 <- c("dex_24hr","dex_2hr","dex_30min","dex_4hr")
treatment_con0 <- c("baseline_24hr","baseline_2hr","baseline_30min","baseline_4hr")
Subset phenotype file by tissue, disease status and treatment (if available). Differential gene expression analysis will be performed in a separate group.
Use subdat function to subset phenotype file into separate files than only contain one tissue type, one treatment/disease status of the disease vs. healthy/treatment vs. baseline comparison.
The phenotype file is named as: geo_id + Tissue + Disease/Treatment status (if available) + treatment vs baseline/disease vs healthy
subdat <- function(pheno) {
tissues=levels(pheno$Tissue)
for (t in tissues) {
pheno_tis <- pheno[pheno$Tissue==t,]
if (!is.null(disease_con1)&(!is.null(treatment_con1))) { # Data contains both disease status and treatment
for (tr in levels(as.factor(c(treatment_con1,treatment_con0)))) { # under the same treatment, get disease and healthy samples
pheno_tr <- pheno_tis[pheno_tis$Treatment==tr,]
for (i in 1:length(disease_con1)) {
if (disease_con1[i]%in%pheno_tr$Disease&disease_con0[i]%in%pheno_tr$Disease) { # under the same treatment, if the data contains both disease and healthy
pheno_dis <- pheno_tr[pheno_tr$Disease%in%c(disease_con1[i],disease_con0[i]),]
name <- paste0(resdir,"/",geo_id,"_",t,"_",tr,"_",disease_con1[i],"_vs_",disease_con0[i],".txt")
write.table(pheno_dis,name,col.names=T,row.names=F,sep="\t",quote=F)
}
}
}
for (dis in levels(as.factor(c(disease_con1,disease_con0)))) { # under the same disease status, get treatment and baseline samples
pheno_dis <- pheno_tis[pheno_tis$Disease==dis,]
for (i in 1:length(treatment_con1)) {
if (treatment_con1[i]%in%pheno_dis$Treatment&treatment_con0[i]%in%pheno_dis$Treatment) { # under the same disease status, if the data contains both treatment and baseline
pheno_tr <- pheno_dis[pheno_dis$Treatment%in%c(treatment_con1[i],treatment_con0[i]),]
name <- paste0(resdir,"/",geo_id,"_",t,"_",dis,"_",treatment_con1[i],"_vs_",treatment_con0[i],".txt")
write.table(pheno_tr,name,col.names=T,row.names=F,sep="\t",quote=F)
}
}
}
}
if (!is.null(disease_con1)&(is.null(treatment_con1))) { # Data only contains disease status
for (i in 1:length(disease_con1)) {
if (disease_con1[i]%in%pheno_tis$Disease&disease_con0[i]%in%pheno_tis$Disease) { # if the data contains both disease and healthy samples
pheno_dis <- pheno_tis[pheno_tis$Disease%in%c(disease_con1[i],disease_con0[i]),]
name <- paste0(resdir,"/",geo_id,"_",t,"_",disease_con1[i],"_vs_",disease_con0[i],".txt")
write.table(pheno_dis,name,col.names=T,row.names=F,sep="\t",quote=F)
}
}
}
if (is.null(disease_con1)&(!is.null(treatment_con1))) { # Data only contains treatment status
for (i in 1:length(treatment_con1)) {
if (treatment_con1[i]%in%pheno_tis$Treatment&treatment_con0[i]%in%pheno_tis$Treatment) { # if the data contains both treatment and baseline samples
pheno_tr <- pheno_tis[pheno_tis$Treatment%in%c(treatment_con1[i],treatment_con0[i]),]
name <- paste0(resdir,"/",geo_id,"_",t,"_",treatment_con1[i],"_vs_",treatment_con0[i],".txt")
write.table(pheno_tr,name,col.names=T,row.names=F,sep="\t",quote=F)
}
}
}
}
}
subdat(pData(raw.data))
Show all the generated separate phenotype files
list.files(path=resdir,pattern=paste0("^",geo_id,".*","_vs_",".*","\\.txt"))
## [1] "GSE4917_MCF10A-Myc_dex_24hr_vs_baseline_24hr.txt"
## [2] "GSE4917_MCF10A-Myc_dex_2hr_vs_baseline_2hr.txt"
## [3] "GSE4917_MCF10A-Myc_dex_30min_vs_baseline_30min.txt"
## [4] "GSE4917_MCF10A-Myc_dex_4hr_vs_baseline_4hr.txt"
Session information
sessionInfo()
## R version 3.3.1 (2016-06-21)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 7 x64 (build 7601) Service Pack 1
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] hgu133acdf_2.18.0 pd.hg.u133a_3.12.0 DBI_0.7
## [4] RSQLite_2.0 bindrcpp_0.2 dplyr_0.7.1
## [7] pander_0.6.0 devtools_1.13.3 Hmisc_4.0-3
## [10] Formula_1.2-2 survival_2.41-3 lattice_0.20-33
## [13] gplots_3.0.1 ggplot2_2.2.1 viridis_0.4.0
## [16] viridisLite_0.2.0 oligo_1.38.0 Biostrings_2.42.1
## [19] XVector_0.14.1 IRanges_2.8.2 S4Vectors_0.12.2
## [22] oligoClasses_1.36.0 GEOquery_2.40.0 Biobase_2.34.0
## [25] BiocGenerics_0.20.0
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-6 bit64_0.9-7
## [3] RColorBrewer_1.1-2 httr_1.2.1
## [5] rprojroot_1.2 GenomeInfoDb_1.10.3
## [7] tools_3.3.1 backports_1.1.0
## [9] R6_2.2.2 affyio_1.44.0
## [11] rpart_4.1-10 KernSmooth_2.23-15
## [13] lazyeval_0.2.0 colorspace_1.3-2
## [15] nnet_7.3-12 withr_2.0.0
## [17] gridExtra_2.2.1 bit_1.1-12
## [19] preprocessCore_1.36.0 htmlTable_1.9
## [21] labeling_0.3 caTools_1.17.1
## [23] scales_0.4.1 checkmate_1.8.3
## [25] stringr_1.2.0 digest_0.6.12
## [27] foreign_0.8-66 rmarkdown_1.6
## [29] base64enc_0.1-3 pkgconfig_2.0.1
## [31] htmltools_0.3.6 htmlwidgets_0.9
## [33] rlang_0.1.1 BiocInstaller_1.24.0
## [35] bindr_0.1 gtools_3.5.0
## [37] acepack_1.4.1 RCurl_1.95-4.8
## [39] magrittr_1.5 Matrix_1.2-6
## [41] Rcpp_0.12.11 munsell_0.4.3
## [43] stringi_1.1.5 yaml_2.1.14
## [45] SummarizedExperiment_1.4.0 zlibbioc_1.20.0
## [47] plyr_1.8.4 grid_3.3.1
## [49] affxparser_1.46.0 blob_1.1.0
## [51] gdata_2.18.0 splines_3.3.1
## [53] knitr_1.16 GenomicRanges_1.26.4
## [55] codetools_0.2-14 XML_3.98-1.9
## [57] glue_1.1.1 evaluate_0.10.1
## [59] latticeExtra_0.6-28 data.table_1.10.4
## [61] foreach_1.4.3 gtable_0.2.0
## [63] assertthat_0.2.0 ff_2.2-13
## [65] tibble_1.3.3 iterators_1.0.8
## [67] AnnotationDbi_1.36.2 memoise_1.1.0
## [69] cluster_2.0.4